# This script details the steps for calculating the PRS using both PLINK (version 1.90) and PRSice.
# Finally, code for producing different plots of the PRS is presented.
# To construct the PRS, the clumping and thresholding method was employed. But because a list of already independent SNPs was used for the PRS, clumping was not necessary. 

# Directory : ../../..

#######################################################################################################################################################################################
# Step 1: Identify the files needed for constructing the PRS

# Required files: 
#1# Target data:  genotype data containing information on all SNPs included in the score and the asthma phenotype - referred to in the code as: IOW_F1_imputed_QCed_PRS
#2# Phenotype data: already specified in fam file
#3# Base data file: summary statistics used to construct the scores (effect sizes, p-value and standard error for each SNP). For PLINK, the ORs were transformed to logOR; PRSice used the untransformed ORs.
#4# Covariates file: the first two principal components, which were able to explain >80% of the variance in the data (for PRSice) - referred to in the code as: PRS_CEU_PC_covariates
#5# File containing SNP IDs and corresponding p-values (for PLINK) - referred to in the code as: UKB_116SNP_PRS_pvalue
#6# Minor allele frequencies (for PLINK) - referred to in the code as: IOW_F1_imputed_QCed_PRS.frq
########################################################################################################################################################################################

#############
### PLINK ###
#############
# Calculate the PRS with PLINK 1.9
# --score should specify column numbers for: SNPID	effective_allele_info_(A1)	Beta
#   Here: column 2 = SNP ID, column 4 = effect allele (A1), column 5 = beta (logOR, see required file #3).
#   Per the PLINK 1.9 documentation, 'header' skips the first line of the score file and
#   'sum' reports the sum of the allele scores instead of the default average.
# --read-freq takes the minor allele frequency file (required file #6).
# --q-score-range takes a range file (range_list) followed by the SNP ID / p-value file
#   (required file #5); range_list is presumably the list of p-value thresholds - confirm its contents.
# Results are written under the PLINK/ directory with the prefix IOW_UKB_116snp_PRS_sum.
plink \
    --bfile IOW_F1_imputed_QCed_PRS \
    --keep-allele-order \
    --allow-no-sex \
    --score UKB_116_PRS_Final_Summary_Statistics_transformed 2 4 5 header sum \
    --read-freq IOW_F1_imputed_QCed_PRS.frq \
    --q-score-range range_list UKB_116SNP_PRS_pvalue \
    --out PLINK/IOW_UKB_116snp_PRS_sum
	
##############
### PRSice ###
##############
#module load R/3.6.1
# Copy the PRSice R wrapper script into the working directory
cp /scratch/dk2e18/PRS/PRSice.R PRSice.R
# Run PRSice with UKB summary stats with Final OR (inversed when A1 not matching IOW MA), adjusted for first two principal components.
# Clumping is switched off (--no-clump) because the SNP list is already independent.
# NOTE: the --cov-col pattern is quoted so the shell passes it to PRSice verbatim; unquoted,
# [1-2] is a glob and would be expanded if a file named @PC1 or @PC2 existed in the
# working directory (or rejected outright under failglob/nullglob).
Rscript PRSice.R --dir . \
    --prsice /PRSice_linux \
    --base UKB_116_PRS_Final_Summary_Statistics_untransformed \
    --A1 A1 \
    --bp BP \
    --chr CHR \
    --pvalue P \
    --snp SNP \
    --stat OR \
    --or \
    --target IOW_F1_imputed_QCed_PRS \
    --thread 1 \
    --binary-target T \
    --cov PRS_CEU_PC_covariates \
    --cov-col '@PC[1-2]' \
    --quantile 5 \
    --quant-ref 1 \
    --no-clump \
    --upper 1.0 \
    --lower 5e-8 \
    --keep-ambig \
    --all-score \
    --score sum \
    --out /IOW_UKB_116snp_PRS_sum_PRSice_adj
	
	
####################################################################################################################################################################
####################################################################################################################################################################
# PRS Plots were constructed in R (version 3.6.1)

# Placeholder working directory: point this at the folder holding the PRSice output
# and the phenotype file before running.
setwd("/../../..")

# Load packages
# plyr is attached before dplyr: attaching plyr last masks several dplyr verbs
# and triggers plyr's own "loaded plyr after dplyr" warning.
library(plyr)
library(dplyr)
library(ggplot2)
library(pROC)

# Load PRS output from PRSice
prs <- read.table("IOW_UKB_116snp_PRS_sum_PRSice_adj.best", header = TRUE)
pheno <- read.table("IOW_asthma_phenotype.txt", header = TRUE)
# Drop the family ID from the phenotype table so the join does not produce FID.x / FID.y
pheno$FID <- NULL
# Keep every scored individual; those without a phenotype record get NA automatically
# (left_join() has no 'fill' argument, so none is passed).
data <- dplyr::left_join(prs, pheno, by = "IID")
# Data found in IOWBC_PRS_data.xlsx, sheet: IOWBC_PRS (n=924)
write.csv(data, file = "PRS_116snp_Asthma10YR_Adjusted.csv", row.names = FALSE)

# Remove those who had missing phenotype data and code phenotype variable to words.
# Phenotype code 1 is labelled "No Asthma"; every other code is labelled "Asthma".
X <- na.omit(data)
X$Asthma_10YR <- ifelse(X$Asthma_10YR == 1, "No Asthma", "Asthma")

### PRS Histogram ###
# Overlapping plot
# Mean PRS per phenotype group; drawn below as dashed vertical lines.
# The plyr functions are namespace-qualified so the call does not depend on the
# order in which plyr and dplyr were attached.
mu <- plyr::ddply(X, "Asthma_10YR", plyr::summarise, grp.mean = mean(PRS))
head(mu)
# Asthma_10YR   grp.mean
#      Asthma  0.1301063
#   No Asthma -0.1083298

pdf("IOW_PRS_116snp_adjusted_group_overlapping_histogram.pdf")
prs_histogram <- ggplot(X, aes(x = PRS, fill = Asthma_10YR, color = Asthma_10YR)) +
  # bins = 30 is the ggplot2 default, stated explicitly to avoid the stat_bin() message
  geom_histogram(position = "identity", alpha = 0.2, bins = 30) +
  geom_vline(data = mu, aes(xintercept = grp.mean, color = Asthma_10YR),
             linetype = "dashed") +
  # theme_light() is a complete theme, so it must come BEFORE the legend tweak;
  # added afterwards it would reset legend.position to its default.
  theme_light() +
  theme(legend.position = "top") +
  labs(title = "PRS Histogram", x = "PRS", y = "Number of Individuals")
# Explicit print() so the plot is drawn into the PDF even when the script is source()d
print(prs_histogram)
dev.off()

### ROC Plot ###
# Read the merged PRS / phenotype table back in from the CSV file
IOW_PRS <- read.csv("PRS_116snp_Asthma10YR_Adjusted.csv", header = TRUE)
# Discard the columns that are not needed for the ROC analysis
IOW_PRS <- IOW_PRS[, setdiff(names(IOW_PRS), c("FID", "In_Regression")), drop = FALSE]

# Recode the phenotype as a 0/1 outcome: code 1 becomes 0, any other code becomes 1
# (missing phenotypes stay NA)
IOW_PRS$Asthma_10YR <- ifelse(IOW_PRS$Asthma_10YR == 1, 0, 1)

# Plot the ROC curve and measure the AUC (with its confidence interval)
# NOTE(review): the file name and title say "unadjusted" while the input CSV is named
# "Adjusted" - confirm which label is intended.
pdf("IOW_PRS_116snp_unadjusted_ROC.pdf")
IOW_PRS_plot <- plot.roc(
  IOW_PRS$Asthma_10YR,
  IOW_PRS$PRS,
  main = "PRS (unadjusted) ROC Plot",
  percent = TRUE,
  ci = TRUE,
  print.auc = TRUE,
  legacy.axes = TRUE
)
# Confidence band for sensitivity, evaluated at specificities 0, 5, ..., 100 (%)
ci_IOW_PRS_plot <- ci.se(IOW_PRS_plot, specificities = seq(0, 100, 5))
plot(ci_IOW_PRS_plot, type = "shape", col = "lightblue2", lty = 0, lwd = 2)
# Confidence interval at the "best" threshold
plot(ci(IOW_PRS_plot, of = "thresholds", thresholds = "best"))
dev.off()













